
# Download cartographic-boundary census tracts for all 50 states plus DC and
# return them split by county.
#
# y: vintage year forwarded to tracts() (presumably tigris::tracts -- confirm
#    against the library() calls at the top of the file).
#
# Returns a named list with one Spatial object per county. Names are the
# 5-character county FIPS (STATEFP + COUNTYFP); each element keeps the FIPS,
# GEOID and ALAND attributes of that county's tracts.
get_tracts <- function(y){
  out <- lapply(c(state.abb, 'DC'), tracts, year = y, cb = TRUE) %>%
    rbind_tigris()
  out$FIPS <- str_c(out$STATEFP, out$COUNTYFP)
  # Select columns by name rather than by position: the positional form
  # c(11, 5, 8, 10) only means FIPS/GEOID/ALAND/geometry for one particular
  # column layout and would silently pick other columns (and lose FIPS) if a
  # vintage ships a different set of attributes.
  # NOTE(review): assumes `out` is an sf object, whose geometry column is
  # retained automatically by `[` -- confirm.
  out <- out[, c('FIPS', 'GEOID', 'ALAND')]
  out <- split(out, ~FIPS)
  # One Spatial object per county so that sp::over() can be used downstream.
  out <- lapply(out, function(county) as(county, 'Spatial'))
  return(out)
}

# Tract polygons for the two vintages used below, each a list keyed by
# county FIPS.
tr16 <- get_tracts(2016)
tr20 <- get_tracts(2020)

# Assign a census-tract GEOID to the records of one cycle by point-in-polygon
# lookup against the tracts of each record's county.
#
# year_in: value of `cycle` to process.
# tracts:  named list of Spatial polygon objects keyed by 5-character county
#          FIPS (the shape get_tracts() returns); each element must carry a
#          GEOID attribute.
#
# Returns a data.table with exactly the columns cycle, component and GEOID,
# keyed on (cycle, component), containing only the records that fell inside
# a tract of their county.
geolocate_year <- function(year_in, tracts){

  keep_tracts <- names(tracts)

  pts <- open_dataset('data/final_long') %>%
    filter(cycle == year_in) %>%
    select(cycle,
           component,
           fips,
           census_id,
           site_lat,
           site_long,
           state_file,
           fips_l2,
           lat,
           long) %>%
    # NOTE(review): `resid_state` is not selected above, so it presumably
    # comes from this lookup table -- confirm.
    left_join(open_dataset('joining/state_fips.parq'),
              by = c('state_file' = 'state')) %>%
    mutate(
      resid_state_pad = str_pad(as.character(resid_state), 2, 'left', '0'),
      fips_l2_pad = str_pad(as.character(fips_l2), 3, 'left', '0')
    ) %>%
    mutate(fips_l2_full = str_c(resid_state_pad, fips_l2_pad)) %>%
    mutate(nchar_fips = nchar(fips)) %>%
    # County code: prefer the one rebuilt from state + fips_l2, otherwise
    # fall back to `fips` when it is a full 5-character code.
    mutate(best_fips = case_when(
      !is.na(fips_l2_full) ~ fips_l2_full,
      !is.na(fips) & nchar_fips == 5 ~ fips,
      TRUE ~ NA_character_
    )) %>%
    # Coordinates are chosen as a pair. The site pair is used only when both
    # of its halves are present; otherwise the lat/long pair is used. Gating
    # on site_lat alone dropped records that had site_lat but no site_long
    # even though lat/long were usable.
    mutate(
      best_lat = case_when(
        !is.na(site_lat) & !is.na(site_long) ~ site_lat,
        !is.na(lat) & !is.na(long) ~ lat,
        TRUE ~ NA_real_
      ),
      best_long = case_when(
        !is.na(site_lat) & !is.na(site_long) ~ site_long,
        !is.na(lat) & !is.na(long) ~ long,
        TRUE ~ NA_real_
      )
    ) %>%
    select(cycle, component, best_fips, best_lat, best_long) %>%
    filter(!is.na(best_fips),
           !is.na(best_lat),
           !is.na(best_long),
           best_fips %in% keep_tracts) %>%
    collect() %>%
    as.data.frame()

  setDT(pts, key = 'best_fips')

  # Nothing to locate: return an empty table with the usual columns instead
  # of failing in setkey() on a column-less rbindlist() result.
  if (nrow(pts) == 0L) {
    located <- data.table(cycle = pts$cycle,
                          component = pts$component,
                          GEOID = character(0))
    setkey(located, cycle, component)
    return(located)
  }

  # The points are labelled with the tracts' CRS; no reprojection is done.
  # NOTE(review): assumes the input coordinates are already expressed in the
  # same CRS as the tract polygons -- confirm.
  tract_crs <- proj4string(tracts[[1]])

  by_county <- split(pts, by = 'best_fips')

  located <- rbindlist(lapply(names(by_county), function(county){
    county_dt <- by_county[[county]]
    county_pts <- sp::`coordinates<-`(as.data.frame(county_dt),
                                      c('best_long', 'best_lat'))
    county_pts <- sp::`proj4string<-`(county_pts, tract_crs)
    # over() returns one row of tract attributes per point, in point order
    # (NA where the point falls in no tract of this county).
    hit <- over(county_pts, tracts[[county]])
    # Build the result by column name. Converting the Spatial object back
    # with as.data.frame() re-attaches the coordinate columns, and slicing
    # that by position leaked a coordinate column into the output, which the
    # later rbindlist() against three-column tables cannot accept.
    data.table(cycle = county_dt$cycle,
               component = county_dt$component,
               GEOID = as.character(hit$GEOID))
  }))

  setkey(located, cycle, component)

  located <- located[!is.na(GEOID)]

  return(located)
}

# Geolocate each cycle against a tract list; the 2012 cycle is matched
# against the 2016 tracts.
out12 <- geolocate_year(2012, tr16)
out16 <- geolocate_year(2016, tr16)
out20 <- geolocate_year(2020, tr20)

# Select the best census tract for each record: keep the tract found by the
# geolocation above; where that failed, use the tract already in the data.
# Fallback: pull observations that have a census_id from CoreLogic, dropping
# any records already matched above.

# Fallback tracts for one cycle: records that carry both `fips` and
# `census_id` get an 11-character GEOID built from `fips` plus the first six
# characters of `census_id`; records already present in `located` (matched on
# cycle + component) are removed.
#
# year_in: value of `cycle` to process.
# located: data.table of already-geolocated records with cycle and component
#          columns.
#
# Returns a data.table with columns cycle, component, GEOID.
census_fallback <- function(year_in, located){
  fallback <- open_dataset('data/final_long') %>%
    filter(cycle == year_in, !is.na(fips), !is.na(census_id)) %>%
    transmute(cycle, component, GEOID = str_c(fips, str_sub(census_id, 1, 6))) %>%
    filter(nchar(GEOID) == 11) %>%
    collect() %>%
    as.data.frame()

  setDT(fallback)
  # Anti-join: keep only the records that were not geolocated.
  fallback[!located, on = .(cycle, component)]
}

# Stack geolocated and fallback records. Only the three shared columns are
# taken from `located`, so the bind cannot fail on an extra column.
bind_fallback <- function(located, fallback){
  shared <- c('cycle', 'component', 'GEOID')
  rbindlist(list(located[, shared, with = FALSE], fallback), use.names = TRUE)
}

# 2012

all12 <- census_fallback(2012, out12)
out12 <- bind_fallback(out12, all12)

# 2016

all16 <- census_fallback(2016, out16)
out16 <- bind_fallback(out16, all16)

# 2020

all20 <- census_fallback(2020, out20)
out20 <- bind_fallback(out20, all20)

# write

# Stack the three cycles into a single table and write it out as a dataset.
out <- list(out12, out16, out20) %>%
  rbindlist(use.names = TRUE)

write_dataset(out, 'data/geo_located/')
